#This script plots the cosine similarity of the opposition minus support vector
#It does this for each country and the largest version (sample size)
#It varies the number of target words sampled for each year-week
#It produces Figure D2 in the paper
library(ggplot2)
library(ggthemes)
library(dplyr)

# Country identifiers; each one has its own folder under data/output/cos_sims/
countries <- c("djazairess", "maghress", "masress", "sauress", "turess")
# One version per number of target words
sample_sizes <- c(2, 5, 10, 15, 20, 50, 100)
versions <- paste0(sample_sizes, "targwords")

# Read the saved cosine similarities for every version x country pair and
# stack them into a single long data frame
all_data <- bind_rows(lapply(versions, function(version) {
  per_country <- lapply(countries, function(country) {
    rds_path <- paste0(
      "data/output/cos_sims/", country, "/", "cos_simsdf_all", version, ".rds"
    )
    readRDS(rds_path) %>%
      rename(yearwk = group, cos_sim = val) %>%
      mutate(yearwk = as.Date(yearwk)) %>%
      arrange(yearwk) %>%
      # Tag every row with the country and version it was read from
      mutate(country = country, version = version)
  })
  bind_rows(per_country)
}))

# Display names for the countries, keyed by the identifier stored in `country`
country_map <- c(
  djazairess = "Algeria",
  maghress = "Morocco",
  masress = "Egypt",
  sauress = "Saudi Arabia",
  turess = "Tunisia"
)

# Look up each row's display name and store it as a factor; the level order
# determines the order of the facets in the figure
all_data$country_name <- factor(
  country_map[all_data$country],
  levels = c("Egypt", "Tunisia", "Algeria", "Morocco", "Saudi Arabia")
)

# # Order of versions
# ordered_versions <- c("150000_weight10", "1500000_weight50", "1500000_weight100", "1500000_weight500", "1500000_weight1000", "1500000_weight2000")
# # Convert 'version' to a factor with specified order
# all_data$version <- factor(all_data$version, levels = ordered_versions)

# Version labels in ascending order of target-word count. Derived from
# `sample_sizes` (rather than typed out again) so this list cannot drift from
# the versions that were actually loaded.
versions <- paste0(sample_sizes, "targwords")

# Number of target words for each row, parsed from the digits in `version`
all_data$numeric_version <- as.numeric(gsub("[^0-9]", "", all_data$version))

# One colour per distinct number of target words; add colours here if more
# sample sizes are introduced
colors_for_numeric_versions <- c(
  "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)


# Figure D2: cosine similarity per year-week, one panel per country and one
# colour per number of target words, with a loess trend over the raw points.
fig_d2 <- ggplot(
  all_data,
  aes(x = yearwk, y = cos_sim, col = as.factor(numeric_version))
) +
  geom_point(alpha = 0.1, size = 2) +
  # NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines
  # (here and in element_rect()/element_line() below); `size` is kept so the
  # script also runs on older ggplot2 releases.
  geom_smooth(method = "loess", size = 1.5, span = .5, se = FALSE) +
  # Scale limits: observations outside this range are dropped before smoothing
  ylim(-.2, 0.2) +
  theme_tufte(base_family = "Helvetica") +
  labs(
    x = "Year-week",
    y = "Cosine similarity, leaders : opposition index"
  ) +
  # Unnamed colours are assigned to the factor levels in order; the legend
  # title is set here and the default labels are the levels themselves
  scale_color_manual(
    values = colors_for_numeric_versions,
    name = "# Target words"
  ) +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(size = 20),
    axis.text.y = element_text(size = 20),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15),
    panel.border = element_rect(colour = "black", fill = NA, size = 1),
    plot.background = element_rect(fill = "white", colour = NA),
    panel.grid.major = element_line(size = 0.1, linetype = "solid"),
    panel.grid.minor = element_line(size = 0.1, linetype = "solid"),
    strip.text = element_text(size = 20)
  ) +
  # Keep all legend keys on a single row, however many levels are plotted
  guides(color = guide_legend(nrow = 1)) +
  facet_wrap(~country_name, ncol = 5)

# Auto-print the figure on the active graphics device
fig_d2

# Make sure the output folder exists before writing the file
dir.create("plots", showWarnings = FALSE, recursive = TRUE)

# Pass the plot explicitly instead of relying on last_plot()
ggsave(
  "plots/figD2.png",
  plot = fig_d2,
  units = "in",
  width = 20,
  height = 5,
  dpi = 300
)